# clear env
rm(list=ls())

# Load packages
library(car)
library(tidyverse)
library(MASS)
library(DiscriMiner)
library(klaR)
#library(aplpack)
library(fpc)
library(cluster)
library(ape)
library(amap)

# Packages pertinent to Ordination
library(vegan) # eggplant
#library(vegan3d)
library(mgcv)
#library(rgl)
library(dplyr)
library(magrittr)

Contributors

Evan Collins ()

Kelly Farley ()

Ken Stier ()

The Dataset

Raw dataset: COVID-19 infection and death statistics from U.S. counties (sourced from NYT), combined with economic, education, and population data (sourced from various government agencies) and also survey responses about mask-wearing frequencies (sourced from NYT). 3141 complete observations on 19 metric variables and 6 categorical variables. To avoid any outliers due to population size differences between counties, all variables are scaled as a percentage of population. Variable descriptions can be found here.

Data of relevance for this pset:

This pset only explores the 67 counties in the best state, Florida.

We look at five continuous variables describing each county: Always_Wear_Mask_Survey, Median_Household_Income_Percent_of_State_Total_2019, Percent_Poverty_2019, Percent_Adults_Less_Than_HS, and Covid_Confirmed_Cases_as_pct. Note that Always_Wear_Mask_Survey and Covid_Confirmed_Cases_as_pct were multiplied by 100 to convert the value from a fraction to percent value (like the other variables). This data is stored in data_ord_base.

For additional continuous variables, we make an environmental dataset. We look at five additional continuous variables describing each county: Unemployment_Rate_2019, Death_Rate_2019, Birth_Rate_2019, Civilian_Labor_Force_2019_as_pct, Percent_Adults_Bachelors_or_Higher. Note that Civilian_Labor_Force_2019_as_pct was multiplied by 100 to convert the value from a fraction to percent value (like the other variables). This data is stored in data_ord_env.

# Process raw dataset as we have done in preceding psets

raw <- read.csv("https://evancollins.com/covid_and_demographics.csv")

# create categorical variables: rural-urban code (3 levels), region (4 variables)

# log transformations of our continuous variables
raw$logMedian_Household_Income_2019 <- log(raw$Median_Household_Income_2019 + 0.0001)
raw$logPercent_Poverty_2019 <- log(raw$Percent_Poverty_2019 + 0.0001)
raw$logCovid_Confirmed_Cases_as_pct <- log(raw$Covid_Confirmed_Cases_as_pct + 0.0001)
# Base dataset of interest for this pset - data_ord_base
data_ord <- raw[raw$State_Name=="Florida", ]
data_ord <- data_ord[, c(2, 9, 12, 13, 14, 22)]

data_ord_base <- data_ord
data_ord_base$Covid_Confirmed_Cases_as_pct <- 100*data_ord_base$Covid_Confirmed_Cases_as_pct
data_ord_base$Always_Wear_Mask_Survey <- 100*data_ord_base$Always_Wear_Mask_Survey

data_ord_base1 <- data_ord_base
data_ord_base <- data_ord_base1[,-1]
rownames(data_ord_base) <- data_ord_base1[,1] # rownames are county names

dim(data_ord_base)
## [1] 67  5
head(data_ord_base)
##          Always_Wear_Mask_Survey
## Alachua                     75.1
## Baker                       44.2
## Bay                         54.8
## Bradford                    38.1
## Brevard                     60.7
## Broward                     79.1
##          Median_Household_Income_Percent_of_State_Total_2019
## Alachua                                                 84.3
## Baker                                                  102.1
## Bay                                                     98.6
## Bradford                                                80.3
## Brevard                                                 97.3
## Broward                                                103.8
##          Percent_Poverty_2019 Percent_Adults_Less_Than_HS
## Alachua                  18.4                         7.6
## Baker                    14.9                        15.5
## Bay                      12.1                         9.7
## Bradford                 21.0                        21.7
## Brevard                   9.4                         8.0
## Broward                  12.3                        11.2
##          Covid_Confirmed_Cases_as_pct
## Alachua                      7.984597
## Baker                       10.838754
## Bay                         10.043788
## Bradford                     9.712422
## Brevard                      5.189537
## Broward                      9.243293
plot(data_ord_base)

# Enviromental variables dataset - data_ord_env
data_ord_env_county <- raw[raw$State_Name=="Florida", ]
data_ord_env_county <- data_ord_env_county[, c(2, 10, 18, 19, 25, 15)]

data_ord_env <- data_ord_env_county
data_ord_env$Civilian_Labor_Force_2019_as_pct <- 100*data_ord_env$Civilian_Labor_Force_2019_as_pct

data_ord_env1 <- data_ord_env
data_ord_env <- data_ord_env1[,-1]
rownames(data_ord_env) <- data_ord_env1[,1] # rownames are county names

dim(data_ord_env)
## [1] 67  5
head(data_ord_env)
##          Unemployment_Rate_2019 Death_Rate_2019 Birth_Rate_2019
## Alachua                     2.9             7.7            10.3
## Baker                       3.1             9.5            11.2
## Bay                         3.9            11.0            12.3
## Bradford                    3.1            12.5            10.2
## Brevard                     3.2            12.5             8.8
## Broward                     3.0             8.2            11.2
##          Civilian_Labor_Force_2019_as_pct Percent_Adults_Bachelors_or_Higher
## Alachua                          51.74526                               42.5
## Baker                            40.57515                               13.4
## Bay                              48.21556                               22.8
## Bradford                         39.22556                               10.6
## Brevard                          47.19508                               29.3
## Broward                          53.28404                               31.9

1

Fit Correspondence Analysis to your data.

All columns of data_ord_base contains the variable data. Correspondence analysis is performed using the cca() function.

# No negative data anyways
# data_ord_base <- data_ord_base[apply(data_ord_base, 1, sum) > 0, ]

#Perform correspondence analysis
data_ord_base_ca <- cca(data_ord_base)

# inertia is measure of departure from ind model; if no relationships from rows and columns; in this case, household income, percent poverty are related; other variables are not so strongly related; if inertia smaller - less structure to data in 5-D 

2

Discuss the inertia, make a two dimensional plot of the first two CA directions.

summary(data_ord_base_ca)
## 
## Call:
## cca(X = data_ord_base) 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total         0.05033          1
## Unconstrained 0.05033          1
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                           CA1      CA2      CA3      CA4
## Eigenvalue            0.03606 0.009955 0.002521 0.001793
## Proportion Explained  0.71649 0.197808 0.050087 0.035616
## Cumulative Proportion 0.71649 0.914297 0.964384 1.000000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## 
## 
## Species scores
## 
##                                                          CA1      CA2       CA3
## Always_Wear_Mask_Survey                             -0.04389 -0.14340  0.002602
## Median_Household_Income_Percent_of_State_Total_2019 -0.12367  0.07293 -0.005506
## Percent_Poverty_2019                                 0.36607  0.01375  0.119450
## Percent_Adults_Less_Than_HS                          0.43620  0.01442 -0.127072
## Covid_Confirmed_Cases_as_pct                         0.23445  0.14590  0.048594
##                                                          CA4
## Always_Wear_Mask_Survey                              0.01398
## Median_Household_Income_Percent_of_State_Total_2019 -0.01211
## Percent_Poverty_2019                                -0.06152
## Percent_Adults_Less_Than_HS                         -0.01670
## Covid_Confirmed_Cases_as_pct                         0.17146
## 
## 
## Site scores (weighted averages of species scores)
## 
##                     CA1      CA2      CA3       CA4
## Alachua      -0.2579870 -1.60706  2.77217  0.400127
## Baker         0.0280475  1.66967 -0.23324  0.190358
## Bay          -0.5365473  0.59860  0.64355  1.166522
## Bradford      1.2707134  1.41794 -0.27880 -1.401618
## Brevard      -1.0058699 -0.33752 -0.04251 -0.470399
## Broward      -0.6113475 -0.97532  0.23822  1.265977
## Calhoun       1.3900689  0.02428 -0.16307  0.484268
## Charlotte    -0.6966676 -0.74566  0.09810 -0.536160
## Citrus       -0.1394691 -0.75200  0.65413 -0.816689
## Clay         -1.2865849  1.56298 -0.68961 -0.642613
## Collier      -0.9516428  0.62048 -1.37787 -0.424347
## Columbia      0.4255024  1.39936  0.85239  0.584438
## DeSoto        1.4124015 -1.54893 -1.23015 -0.215095
## Dixie         1.5911653  0.66090  0.41171 -1.853031
## Duval        -0.4777867 -0.10803  0.67291  0.565342
## Escambia     -0.1999783  0.46290  1.84232  0.884221
## Flagler      -1.1162511  0.02829 -0.11021 -1.040040
## Franklin      1.0293941  1.18179 -0.08849 -0.391357
## Gadsden       1.1331068 -0.38765  0.06956  1.041970
## Gilchrist     0.2923450  0.49907  0.04248 -1.120320
## Glades        0.9602104 -1.33106 -1.49877 -1.420773
## Gulf          0.2524635  1.27854  0.02130  2.094265
## Hamilton      2.4514005  1.14909  2.54929 -2.856752
## Hardee        1.6243491  0.18401 -0.48894 -0.778705
## Hendry        1.6425587 -0.76661 -3.48737 -0.265267
## Hernando     -0.2998987 -0.60987 -0.40381 -0.858329
## Highlands     0.2519377 -1.18798 -0.01501 -0.567841
## Hillsborough -0.5718720 -0.69371  0.19420 -0.141031
## Holmes        1.4478480 -0.62345 -0.36468  0.727175
## Indian River -0.6894247  0.04277 -0.33385 -0.587752
## Jackson       1.2323901 -0.11028  0.52615  1.685731
## Jefferson     0.5199624 -1.08804  0.05242  0.654107
## Lafayette     1.5911661  1.87293 -0.72446  3.927492
## Lake         -0.6913000 -0.39668 -0.26513 -0.220985
## Lee          -0.6743885 -0.12695 -0.50324 -0.126535
## Leon         -0.2733104 -0.75881  3.57284  0.209794
## Levy          0.6182007 -1.17123  0.61102 -0.978715
## Liberty       1.2580903  0.50200  1.53857  0.360751
## Madison       1.7895438  1.25426  1.29302 -1.010829
## Manatee      -0.7719348  0.15722 -0.19547 -0.142531
## Marion        0.0007508 -0.19202  0.44339 -0.551968
## Martin       -1.1684693 -0.20071 -0.58139 -0.246356
## Miami-Dade    0.2996775 -0.69843 -0.18127  2.816490
## Monroe       -1.0677179 -0.94099  0.03983  0.703280
## Nassau       -1.1748135  1.88767 -0.34885 -0.333700
## Okaloosa     -0.9045129  1.42972  0.35211 -0.007087
## Okeechobee    0.9644400 -1.46129 -1.64082 -0.286844
## Orange       -0.6481869 -0.54586  0.03647  0.195285
## Osceola      -0.1761811 -1.10587  0.15055  1.364315
## Palm Beach   -0.7863143 -0.73505 -0.34040  0.290999
## Pasco        -0.6495546 -1.12949 -0.22874 -0.211219
## Pinellas     -0.7857529 -1.30050  0.41633  0.359708
## Polk         -0.0173068 -0.63077 -0.47020 -0.120923
## Putnam        1.0882213 -0.83097  0.84041 -1.396901
## St. Johns    -1.8246946  1.91832 -0.46595 -0.771806
## St. Lucie    -0.4973983 -0.51158 -1.18380 -0.222083
## Santa Rosa   -0.9793112  1.73872  0.06881  0.120004
## Sarasota     -1.3034658 -0.42690 -0.25311  0.107948
## Seminole     -1.4009072 -0.32091  0.34367 -0.478166
## Sumter       -1.1177304 -0.06506 -0.32542 -0.421412
## Suwannee      0.8290661  1.75865 -0.30903  0.183506
## Taylor        1.4076618  0.46177 -0.43854  0.710780
## Union         1.0743577  0.92255 -0.77718 -0.237763
## Volusia      -0.5157772 -0.59935  0.52913 -0.612641
## Wakulla      -0.3710730  1.56935 -0.22170  0.039781
## Walton       -0.6230454  1.06728 -0.38047  0.383858
## Washington    1.0322373 -0.42952  0.54986  0.195862

Inertia (equal to squared eigenvalues) is like variance and measures departures from the independence model. We see that the inertia value is 0.05033. The magnitude of inertia does not reflect more or less variance per se; it is reflective of the magnitude of the data. (Note that multiplying fractions by 100 to make values as percents did not increase this inertia magnitude).

In the “Proportion Explained” row, we can see that first CA direction explains 0.71649 (~72%) of the relation. The “Cumulative Proportion” by the second CA direction is 0.914297; hence, the first and second CA directions explain the vast majority of total inertia. The third and fourth CA directions have significantly smaller “Proportion Explained” values. This suggests that there are likely two major underlying discriminatory dimensions captured by the data.

#plot results
plot(data_ord_base_ca, main = "Correspondence Analysis for FL Counties", type = "n")
text(data_ord_base_ca, dis = "wa", labels = rownames(data_ord_base))
points(data_ord_base_ca, pch = 21, col = "red", bg = "yellow", cex = 1.2)
text(data_ord_base_ca, "species", col = "blue", cex = 0.8)

Add environmental variables.

plot(data_ord_base_ca, main = "Correspondence Analysis for FL Counties", type = "n")
points(data_ord_base_ca, pch = 19, col = "black", cex = 1)
text(data_ord_base_ca, "species", col = "blue", cex = 1.1)
#add environmental variables
fit <- envfit(data_ord_base_ca, data_ord_env, permutations = 1000)
plot(fit, col = "red", lwd = 3)

#get significance - all environmental variables are significant
fit
## 
## ***VECTORS
## 
##                                         CA1      CA2     r2   Pr(>r)    
## Unemployment_Rate_2019              0.93609 -0.35176 0.2400 0.000999 ***
## Death_Rate_2019                     0.61828 -0.78596 0.0084 0.793207    
## Birth_Rate_2019                     0.99605  0.08877 0.0450 0.246753    
## Civilian_Labor_Force_2019_as_pct   -0.97424 -0.22551 0.4110 0.000999 ***
## Percent_Adults_Bachelors_or_Higher -0.99468 -0.10305 0.6962 0.000999 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation: free
## Number of permutations: 1000

We can see that all environmental variables are significant (p < 0.05) except Birth_Rate_2019 and Death_Rate_2019. We will omit these variables from the environmental variable dataset for future analyses.

data_ord_env <- subset(data_ord_env, select=-2)
data_ord_env <- subset(data_ord_env, select=-2)

This plot is somewhat hard to read, so we try detrended correspondence analysis. This is even harder to read. DCA uses the decorana() function.

#detrended correspondence analysis
data_ord_base_dca <- decorana(data_ord_base)
plot(data_ord_base_dca, main = "DCA for Rural-Urban Type", type = "n")
text(data_ord_base_dca, display = c("sites"), labels = rownames(data_ord_base), cex = 0.86)
points(data_ord_base_dca, pch = 21, col = "red", bg = "yellow", cex = 0.6)
text(data_ord_base_dca, "species", col = "blue", cex = 0.6)

#add environmental variables
fit <- envfit(data_ord_base_dca, data_ord_env, permutations = 1000)
plot(fit, col = "red", lwd = 3)

3

Comment on whether or not there is any evidence of ‘data snaking’ in higher dimensional space.

pc1 <- princomp(data_ord_env, cor=TRUE)

source("http://reuningscherer.net/multivariate/r/ciscoreplot.R.txt")
ciscoreplot(pc1,c(1,2),data_ord_env[,1])
## Warning in sqrt((5.99 - (y1vec^2)/x$sdev[comps[1]]^2) * x$sdev[comps[2]]^2):
## NaNs produced

## Warning in sqrt((5.99 - (y1vec^2)/x$sdev[comps[1]]^2) * x$sdev[comps[2]]^2):
## NaNs produced

There is no evidence of data snaking in higher dimensional space. Evidence of snaking would be a PCA score plot that looks like a horseshoe. However, the above scoreplot appears random and therefore does not indicate data snaking.

4

In a few sentences, describe what you conclude from your plot.

From our first plot in (2) of the first two CA directions, we should be able to find which counties are similar and what are the columns on which they are similar. Overall, the counties seem evenly and randomly scattered between the 4 quadrants- we do not note rows near columns, so there is not association not accounted for by the independence model. Generally, the first correspondence axis is associated with low income and low education and high COVID-19 rates, while the second correspondence axis is associated primarily with poor masking behaviors and high COVID-19 rates, perhaps indicating two different types of counties that are associated with high COVID-19 rates (those in poorer, disadvantaged areas and also those with poor masking behaviors). As one may expect, the percent poverty and percent of adults with less than a high school degree point in the same direction, while the median household income points in the opposite direction.

5

Perform Multidimensional Scaling (metric or non-metric) for 1, 2, and 3 dimensions.

results <- matrix(NA, 21, 5)
#j is number of dimensions to try
for (j in 1:5){
  for (i in 1:20){
    temp <- data_ord_base[shuffle(nrow(data_ord_base)), 1]
    for (k in 2:12) { temp <- cbind(temp, data_ord_base[shuffle(nrow(data_ord_base)), k]) }
    #store stress
    results[i, j] <- metaMDS(temp, k = j, distance = "euclidean")$stress
  }
  results[21, j] <- metaMDS(data_ord_base, k = j, distance = "euclidean")$stress
}

# Note: results are hidden (too long)

6

Discuss the stress (or SStress) of each dimensional solution. Make a scree plot if you’re able.

#plot stress results

plot(c(1:5), results[21, ], type = "b", col = "blue", lwd = 3, 
     ylim = c(0, max(results)), xlab = "Dimensions", ylab = "Stress", pch = 19, 
     main = "MDS for Rural-Urban Data, Euclidean Distance")
mins <- apply(results[1:20, ], 2, min)
maxs <- apply(results[1:20, ], 2, max)
meds <- apply(results[1:20, ], 2, median)

for (i in 1:5){
  points(rep(i, 3), c(mins[i], meds[i], maxs[i]), type = "b", col = "red", lwd = 3, pch = 19)
}
legend(3.5, (.9*max(results)), c("MDS Solution", "20 Permutations"), lwd = 3, col = c("blue", "red"))

After performing multidimensional scaling for 1-5 dimensions, the above scree plot for stress illustrates an elbow at 2 dimensions. This stress level is below 10% and indicates a good fit. For 3 dimensions, the stress is below 5% and indicates an excellent fit. After 3 dimensions, random chance could result in comparable stress values.

Stress is a measure of the difference between actual pairwise distances and calculated reference distances; a lower stress indicates a better fit. As the dimensions exceeds that of the data (for 4 and 5 dimensions), the stress goes to 0.

7

Make a two dimensional plot of your MDS results.

data_ord_base.mds2 <- metaMDS(data_ord_base, k = 2, distance = "euclidean")
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0.0762502 
## Run 1 stress 0.08194923 
## Run 2 stress 0.0861957 
## Run 3 stress 0.07130048 
## ... New best solution
## ... Procrustes: rmse 0.03267128  max resid 0.2604236 
## Run 4 stress 0.07625021 
## Run 5 stress 0.07130039 
## ... New best solution
## ... Procrustes: rmse 3.250616e-05  max resid 0.0001380985 
## ... Similar to previous best
## Run 6 stress 0.07130042 
## ... Procrustes: rmse 3.790379e-05  max resid 0.0001629308 
## ... Similar to previous best
## Run 7 stress 0.07130088 
## ... Procrustes: rmse 0.0001630259  max resid 0.0009735512 
## ... Similar to previous best
## Run 8 stress 0.07130039 
## ... Procrustes: rmse 4.144036e-05  max resid 0.0002082925 
## ... Similar to previous best
## Run 9 stress 0.1132729 
## Run 10 stress 0.08194625 
## Run 11 stress 0.0861957 
## Run 12 stress 0.07130038 
## ... New best solution
## ... Procrustes: rmse 2.054961e-05  max resid 0.0001042096 
## ... Similar to previous best
## Run 13 stress 0.08194625 
## Run 14 stress 0.0861437 
## Run 15 stress 0.07625021 
## Run 16 stress 0.1306309 
## Run 17 stress 0.07625021 
## Run 18 stress 0.0713005 
## ... Procrustes: rmse 5.373399e-05  max resid 0.0002521113 
## ... Similar to previous best
## Run 19 stress 0.07130041 
## ... Procrustes: rmse 5.1395e-05  max resid 0.000269991 
## ... Similar to previous best
## Run 20 stress 0.07130037 
## ... New best solution
## ... Procrustes: rmse 1.996309e-05  max resid 0.0001228241 
## ... Similar to previous best
## *** Solution reached
plot(data_ord_base.mds2, type = "t")

stressplot(data_ord_base.mds2)

The R-squared values seem sufficiently high with the two dimensional MDS result.

8

If possible, overlay some other continuous variable(s) to interpret your ordination axes. Calculate p-values for the overlaid additional variable(s). If you can, get some non-linear wireplots of the these overlaid variables (see examples online in R).

We can also add environmental variables to our plot.

fig <- ordiplot(data_ord_base.mds2, type = "none", cex = 1.1)
text(fig, "species", col = "red", cex = 1.1)
text(fig, "sites", col = "blue", cex = 0.8)

fit <- envfit(data_ord_base.mds2, data_ord_env, permutations = 1000)
plot(fit, col = "black", lwd = 3)

fit <- envfit(data_ord_base_ca, data_ord_env, permutations = 1000)
fit
## 
## ***VECTORS
## 
##                                         CA1      CA2     r2   Pr(>r)    
## Unemployment_Rate_2019              0.93609 -0.35176 0.2400 0.000999 ***
## Civilian_Labor_Force_2019_as_pct   -0.97424 -0.22551 0.4110 0.000999 ***
## Percent_Adults_Bachelors_or_Higher -0.99468 -0.10305 0.6962 0.000999 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation: free
## Number of permutations: 1000

The three overlaid continuous variables above are all significant with p<0.05. This is graphically suggested by the long length of the lines.

mds4 <- metaMDS(data_ord_env, distance="euclidean", k=4)
## Square root transformation
## Wisconsin double standardization
## Run 0 stress 0 
## Run 1 stress 0.001162408 
## Run 2 stress 0.0006615713 
## Run 3 stress 0.0007606281 
## Run 4 stress 0.0009378631 
## Run 5 stress 0.0006934562 
## Run 6 stress 0.0005385021 
## Run 7 stress 0.0008085908 
## Run 8 stress 0.0008565678 
## Run 9 stress 0.001000305 
## Run 10 stress 0.0009884766 
## Run 11 stress 0.0007089045 
## Run 12 stress 0.000779397 
## Run 13 stress 0.0007901714 
## Run 14 stress 0.001251475 
## Run 15 stress 0.0006856797 
## Run 16 stress 0.0007331602 
## Run 17 stress 0.0006072866 
## Run 18 stress 0.0008370989 
## Run 19 stress 0.0007905529 
## Run 20 stress 0.0006026403 
## *** No convergence -- monoMDS stopping criteria:
##     20: no. of iterations >= maxit
## Warning in metaMDS(data_ord_env, distance = "euclidean", k = 4): stress is
## (nearly) zero: you may have insufficient data
fig <- ordiplot(mds4, type = "none", cex = 1.1, main = "NMDS for COVID-19 Data")
text(fig, "species", col = "red", cex = 0.7)
text(fig, "sites", col = "black", cex = 0.7)
plot(fit)
tmp1 <- with(data_ord_env, ordisurf(mds4, Unemployment_Rate_2019, add = TRUE))
tmp2 <- with(data_ord_env, ordisurf(mds4, Percent_Adults_Bachelors_or_Higher, add = TRUE, col = "green4"))
tmp3 <- with(data_ord_env, ordisurf(mds4, Civilian_Labor_Force_2019_as_pct, add = TRUE, col = "purple"))

vis.gam(tmp1, main = "Unemployment Rate")

vis.gam(tmp2, main = "Percentage of Adults with Bachelor's or Higher")

vis.gam(tmp3, main = "Civilian Labor Force Percentage")

9

Again, assuming you have at least one additional continuous variable, perform canonical correspondence analysis.

As directed, here we’ll perform CCA – both with and without (or the other way around) the environmental variables.

data_ord_base_cca1 <- cca(data_ord_base, scale="FALSE")
data_ord_base_cca2 <- cca(data_ord_base, data_ord_env, scale="FALSE")
plot(data_ord_base_cca1, main="CCA without env")

plot(data_ord_base_cca2, main="CCA with env")

#plot(data_ord_base_cca, main = "CCA for Rural-Urban Type", type = "n")
#points(data_ord_base_cca, pch = 19, col = "red", cex = 1)
#text(data_ord_base_cca, "species", col = "blue", cex = 0.7)
#text(data_ord_base_cca, display = c("sites"), labels = rownames(data_ord_base), cex = 0.5)

(fit_cca <- envfit(data_ord_base_cca2, data_ord_env, permutations=1000))
## 
## ***VECTORS
## 
##                                         CCA1      CCA2     r2   Pr(>r)    
## Unemployment_Rate_2019              0.977760 -0.209739 0.2553 0.000999 ***
## Civilian_Labor_Force_2019_as_pct   -0.997160 -0.075256 0.4131 0.000999 ***
## Percent_Adults_Bachelors_or_Higher -0.999830 -0.018480 0.7048 0.000999 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Permutation: free
## Number of permutations: 1000
plot(data_ord_base_cca2)
plot(fit_cca, col = "red", lwd = 3)

summary(data_ord_base_cca2)
## 
## Call:
## cca(X = data_ord_base, Y = data_ord_env, scale = "FALSE") 
## 
## Partitioning of scaled Chi-square:
##               Inertia Proportion
## Total         0.05033     1.0000
## Constrained   0.02673     0.5312
## Unconstrained 0.02360     0.4688
## 
## Eigenvalues, and their contribution to the scaled Chi-square 
## 
## Importance of components:
##                          CCA1     CCA2      CCA3     CA1     CA2      CA3
## Eigenvalue            0.02517 0.001433 0.0001286 0.01225 0.00795 0.002016
## Proportion Explained  0.50014 0.028465 0.0025542 0.24348 0.15796 0.040061
## Cumulative Proportion 0.50014 0.528601 0.5311551 0.77463 0.93259 0.972652
##                            CA4
## Eigenvalue            0.001376
## Proportion Explained  0.027348
## Cumulative Proportion 1.000000
## 
## Accumulated constrained eigenvalues
## Importance of components:
##                          CCA1     CCA2      CCA3
## Eigenvalue            0.02517 0.001433 0.0001286
## Proportion Explained  0.94160 0.053590 0.0048088
## Cumulative Proportion 0.94160 0.995191 1.0000000
## 
## Scaling 2 for species and site scores
## * Species are scaled proportional to eigenvalues
## * Sites are unscaled: weighted dispersion equal on all dimensions
## 
## 
## Species scores
## 
##                                                         CCA1     CCA2      CCA3
## Always_Wear_Mask_Survey                             -0.05278 -0.05184  0.004032
## Median_Household_Income_Percent_of_State_Total_2019 -0.09263  0.02673 -0.003443
## Percent_Poverty_2019                                 0.28361  0.01487 -0.020196
## Percent_Adults_Less_Than_HS                          0.39865 -0.01652  0.001006
## Covid_Confirmed_Cases_as_pct                         0.17152  0.06858  0.043256
##                                                          CA1      CA2       CA3
## Always_Wear_Mask_Survey                             -0.03731  0.12488 -0.002997
## Median_Household_Income_Percent_of_State_Total_2019  0.09936 -0.03903  0.005827
## Percent_Poverty_2019                                -0.22620 -0.08500  0.111714
## Percent_Adults_Less_Than_HS                         -0.16894 -0.05727 -0.102570
## Covid_Confirmed_Cases_as_pct                        -0.12422 -0.18453 -0.063076
## 
## 
## Site scores (weighted averages of species scores)
## 
##                  CCA1     CCA2      CCA3      CA1       CA2      CA3
## Alachua      -0.44270 -3.40786  -0.24161 -2.85980  0.330773  1.25210
## Baker         0.10024  4.26789   0.42300  1.55293 -0.777995  0.29274
## Bay          -0.64438  1.89645   3.41489  1.04832 -1.052120  0.56516
## Bradford      1.58692  3.23318  -4.78274 -0.08183 -1.114071  0.46772
## Brevard      -1.20371 -0.70458  -2.04920  0.64061  0.584208  0.35922
## Broward      -0.78323 -2.24673   4.48190 -0.56799  0.490985 -0.42667
## Calhoun       1.65422 -0.23416   2.50777 -0.54201 -0.193127 -0.38618
## Charlotte    -0.85407 -1.79553  -2.14568  0.79143  1.385045  0.58618
## Citrus       -0.20269 -1.81547  -3.34039  0.73548  0.866036  1.24778
## Clay         -1.44785  4.10021  -2.85479  2.36989 -0.826960  0.41909
## Collier      -1.06933  1.44385  -1.13605  0.10109 -0.693334 -1.60439
## Columbia      0.53112  3.77349   1.32211  0.49487 -1.132029  0.77115
## DeSoto        1.65614 -4.61067   1.16665 -1.24409  1.679176 -1.08893
## Dixie         1.92664  1.34252  -6.60205 -0.62286 -0.748921  1.16633
## Duval        -0.59565  0.02211   1.40287 -0.13757 -0.465944  0.39331
## Escambia     -0.27687  1.74232   1.67822 -0.22159 -0.703872  1.20944
## Flagler      -1.31372  0.20796  -4.24471  1.62497  0.553084  0.85203
## Franklin      1.27565  2.79012  -1.24754 -0.56045 -1.369445 -0.23333
## Gadsden       1.32128 -1.15008   4.38857 -1.08992 -0.501499 -0.58992
## Gilchrist     0.37703  1.16008  -4.16196  0.72436  0.130178  0.93945
## Glades        1.14550 -4.10171  -3.33032 -0.12684  1.473841 -0.30031
## Gulf          0.32654  3.40464   7.44655  0.81659 -2.320638 -0.66988
## Hamilton      2.92405  2.86542 -11.62673 -1.93242 -1.893437  3.02394
## Hardee        1.96030 -0.03744  -1.83912 -0.69453 -1.048443 -0.31393
## Hendry        2.01789 -3.18102   2.53173 -0.30451 -0.804561 -2.95680
## Hernando     -0.36066 -1.67076  -2.82344  1.06470  0.998128  0.50630
## Highlands     0.26128 -3.16753  -1.63842  0.06873  1.183650  0.38727
## Hillsborough -0.71013 -1.63661  -0.72228 -0.59068  0.236228  0.02218
## Holmes        1.70129 -1.95197   3.75463 -0.98807  0.815789 -0.73717
## Indian River -0.80417  0.13399  -2.23177  0.48561 -0.003610 -0.15747
## Jackson       1.43155 -0.29918   6.38724 -0.94451 -0.007011 -0.43450
## Jefferson     0.56973 -2.86330   2.89773 -1.17235  1.130851 -0.60445
## Lafayette     1.94225  4.61041  15.16637 -1.27467 -1.769550 -3.32407
## Lake         -0.82846 -0.95936  -0.81247  0.92437  1.089037  0.26855
## Lee          -0.79304 -0.31602  -0.35654  0.36711  0.453950 -0.39778
## Leon         -0.44762 -1.03335  -1.77355 -2.97313 -1.020685  1.87070
## Levy          0.68588 -3.08008  -3.45481 -0.10012  1.332323  1.50243
## Liberty       1.47275  1.42754   0.60829 -0.79446 -0.277589  0.99307
## Madison       2.15379  3.10521  -4.23025 -1.34032 -2.244500  1.29918
## Manatee      -0.90616  0.51169  -0.76392  0.39430  0.158330 -0.19494
## Marion       -0.01206 -0.42917  -2.30415  0.33564  0.456923  0.80712
## Martin       -1.38051 -0.43010  -0.93709  0.49063  0.525289 -0.53174
## Miami-Dade    0.30614 -1.71364  10.82034 -1.67193  0.359637 -1.97126
## Monroe       -1.31414 -2.14927   2.34747 -0.07716  0.763749  0.07965
## Nassau       -1.31454  5.01949  -2.01014  1.71006 -1.252636  0.07229
## Okaloosa     -1.03147  3.96982  -1.07972  0.72880 -0.963809  0.20686
## Okeechobee    1.13767 -4.39348   0.98386 -0.43563  1.593806 -1.01245
## Orange       -0.79477 -1.25250   0.55677 -0.56422 -0.031170 -0.22220
## Osceola      -0.26909 -2.68968   5.13622  0.14793  1.049223  0.03052
## Palm Beach   -0.95756 -1.79612   1.17823 -0.58413  0.254155 -0.87391
## Pasco        -0.80740 -2.85309  -0.58236  0.59090  1.567402  0.24828
## Pinellas     -0.99830 -3.07182   1.02956 -0.29765  1.202111  0.27183
## Polk         -0.03103 -1.74815   0.06866  0.23412  0.697029 -0.14035
## Putnam        1.25636 -2.27366  -5.04933 -0.62878  0.492432  1.52447
## St. Johns    -2.07921  5.17633  -3.83043  0.81310 -1.728165 -0.66101
## St. Lucie    -0.57847 -1.51729  -0.02183  1.10796  0.643239 -0.43374
## Santa Rosa   -1.10287  4.72538  -0.52166  1.32545 -1.239094  0.10723
## Sarasota     -1.56170 -0.88287   0.12730  0.35835  0.902196 -0.52889
## Seminole     -1.68298 -0.48667  -2.54127  0.12969  0.054256  0.43293
## Sumter       -1.31980 -0.04108  -1.78500  0.78670  0.750345 -0.72767
## Suwannee      1.05876  4.31106   0.78212  0.23530 -1.765324 -0.34052
## Taylor        1.69637  0.84303   3.43101 -0.16838 -0.384565 -0.46401
## Union         1.33550  1.95629  -0.08314  0.26188  0.379401 -0.54734
## Volusia      -0.64360 -1.35740  -2.70563  0.47984  0.739835  1.14598
## Wakulla      -0.37721  4.08455  -0.28765  1.52163 -0.774887  0.30464
## Walton       -0.69538  2.82575   1.12300  0.48687 -0.468831 -0.75593
## Washington    1.19601 -1.18162   0.88880 -0.50963  0.689445  0.59098
## 
## 
## Site constraints (linear combinations of constraining variables)
## 
##                   CCA1      CCA2      CCA3      CA1       CA2      CA3
## Alachua      -2.007956 -0.002997 -0.399842 -2.85980  0.330773  1.25210
## Baker         0.922848  0.890041  0.902784  1.55293 -0.777995  0.29274
## Bay          -0.084847 -1.772954  0.592189  1.04832 -1.052120  0.56516
## Bradford      1.207997  1.058849  0.995538 -0.08183 -1.114071  0.46772
## Brevard      -0.688836 -0.098129  0.142918  0.64061  0.584208  0.35922
## Broward      -0.995305 -0.641658  0.970594 -0.56799  0.490985 -0.42667
## Calhoun       1.299548 -0.122155 -0.382269 -0.54201 -0.193127 -0.38618
## Charlotte    -0.020235  0.283009 -0.982653  0.79143  1.385045  0.58618
## Citrus        0.557753 -0.969861 -2.092434  0.73548  0.866036  1.24778
## Clay         -0.197145 -0.107359  1.164709  2.36989 -0.826960  0.41909
## Collier      -1.354230  0.267885 -0.607522  0.10109 -0.693334 -1.60439
## Columbia      0.696933  0.569731  0.755388  0.49487 -1.132029  0.77115
## DeSoto        1.138442  0.585270  0.344483 -1.24409  1.679176 -1.08893
## Dixie         1.351199  0.389223 -0.060075 -0.62286 -0.748921  1.16633
## Duval        -0.721987 -0.975451  0.769796 -0.13757 -0.465944  0.39331
## Escambia     -0.393925  0.069627  0.246089 -0.22159 -0.703872  1.20944
## Flagler      -0.166712 -0.352844 -0.719791  1.62497  0.553084  0.85203
## Franklin      0.574643  0.521513 -0.125193 -0.56045 -1.369445 -0.23333
## Gadsden       0.570257 -1.149294  0.011786 -1.08992 -0.501499 -0.58992
## Gilchrist     0.858024  0.652154  0.278065  0.72436  0.130178  0.93945
## Glades        1.258029 -0.506087  0.528679 -0.12684  1.473841 -0.30031
## Gulf          0.444904 -2.698311 -0.367362  0.81659 -2.320638 -0.66988
## Hamilton      1.397363  0.529928 -0.980435 -1.93242 -1.893437  3.02394
## Hardee        1.253258 -1.481501 -1.430620 -0.69453 -1.048443 -0.31393
## Hendry        1.393228 -4.009958 -0.790186 -0.30451 -0.804561 -2.95680
## Hernando      0.530182 -0.659333 -0.817613  1.06470  0.998128  0.50630
## Highlands     0.601215 -0.760695 -1.457619  0.06873  1.183650  0.38727
## Hillsborough -1.053212 -0.450856  0.441390 -0.59068  0.236228  0.02218
## Holmes        1.263857  0.876267  0.155464 -0.98807  0.815789 -0.73717
## Indian River -0.546316 -0.293234 -1.144739  0.48561 -0.003610 -0.15747
## Jackson       0.945306  0.485231 -0.005222 -0.94451 -0.007011 -0.43450
## Jefferson     0.092725  0.825488 -0.363288 -1.17235  1.130851 -0.60445
## Lafayette     0.756734  2.215306 -0.388922 -1.27467 -1.769550 -3.32407
## Lake         -0.009966  0.412752  0.248184  0.92437  1.089037  0.26855
## Lee          -0.520889  0.403451  0.042881  0.36711  0.453950 -0.39778
## Leon         -2.335794 -0.541335 -0.684923 -2.97313 -1.020685  1.87070
## Levy          0.988017 -0.273134  0.567823 -0.10012  1.332323  1.50243
## Liberty       1.065071  1.340765 -0.798182 -0.79446 -0.277589  0.99307
## Madison       0.867993 -0.465183  0.385345 -1.34032 -2.244500  1.29918
## Manatee      -0.661535  0.516213 -0.210707  0.39430  0.158330 -0.19494
## Marion        0.350330  0.080264 -0.686281  0.33564  0.456923  0.80712
## Martin       -1.034645  0.355132 -0.395475  0.49063  0.525289 -0.53174
## Miami-Dade   -0.656128  0.877023  1.313950 -1.67193  0.359637 -1.97126
## Monroe       -1.202339 -0.400639  2.963260 -0.07716  0.763749  0.07965
## Nassau       -0.567587  0.591939  0.341978  1.71006 -1.252636  0.07229
## Okaloosa     -0.790901  1.113859  0.119242  0.72880 -0.963809  0.20686
## Okeechobee    1.091179 -0.207662  1.140160 -0.43563  1.593806 -1.01245
## Orange       -1.191455 -0.644449  1.058354 -0.56422 -0.031170 -0.22220
## Osceola       0.145779 -0.756823  1.364936  0.14793  1.049223  0.03052
## Palm Beach   -1.330926 -0.477918 -0.338012 -0.58413  0.254155 -0.87391
## Pasco        -0.082094 -0.112526 -0.078787  0.59090  1.567402  0.24828
## Pinellas     -0.882716 -0.236182  0.647977 -0.29765  1.202111  0.27183
## Polk          0.250653 -0.446754  0.049231  0.23412  0.697029 -0.14035
## Putnam        1.037210 -0.715389 -0.546727 -0.62878  0.492432  1.52447
## St. Johns    -2.100531  0.890748 -0.620499  0.81310 -1.728165 -0.66101
## St. Lucie     0.196923 -1.166346  0.217540  1.10796  0.643239 -0.43374
## Santa Rosa   -0.545711  0.637809  0.042468  1.32545 -1.239094  0.10723
## Sarasota     -1.176335  0.780044 -0.981018  0.35835  0.902196 -0.52889
## Seminole     -1.613426 -0.471502  0.472865  0.12969  0.054256  0.43293
## Sumter       -0.681182  0.808735 -4.615416  0.78670  0.750345 -0.72767
## Suwannee      0.788376  0.307791  0.576808  0.23530 -1.765324 -0.34052
## Taylor        1.516638  0.180899  0.861904 -0.16838 -0.384565 -0.46401
## Union         1.519029  2.642363 -0.115282  0.26188  0.379401 -0.54734
## Volusia      -0.128104 -0.398469  0.452265  0.47984  0.739835  1.14598
## Wakulla       0.406747  0.785360  1.065397  1.52163 -0.774887  0.30464
## Walton       -0.547063  1.422236 -0.247075  0.48687 -0.468831 -0.75593
## Washington    1.105397  0.681665  0.482510 -0.50963  0.689445  0.59098
## 
## 
## Biplot scores for constraining variables
## 
##                                       CCA1     CCA2     CCA3 CA1 CA2 CA3
## Unemployment_Rate_2019              0.5473 -0.60888 -0.57427   0   0   0
## Civilian_Labor_Force_2019_as_pct   -0.7554 -0.18958  0.62725   0   0   0
## Percent_Adults_Bachelors_or_Higher -0.9991  0.02401 -0.03542   0   0   0

The three overlaid continuous variables above are all significant with p<0.001. However, the continuous variables don’t have a great distribution on this plot, so the discriminating ability is probably not as helpful as what we might like.

10

Finally, write a paragraph or so comparing the methods you’ve used, discuss what conclusions you reach, etc.

The counties are well distributed across the quadrants in each of our CCA methods, and we find significance in three of our environmental variables. Our two-dimensional MDS results are robust and suggest two dimensions are likely optimal, although three dimensions could also be considered. Our results for canonical correspondence analysis (CCA) are somewhat concentrated and difficult to read. Although our relatively high amount of counties (67) contributes to the difficulty in discerning the plot, the CCA plot is particularly concentrated. We believe the NMDS plot in #8 with contour lines optimally illustrates the distribution of the counties and their relations to the NMDS axes and environmental variables. It conveys a lot of information in a single plot. Moreover, we can see that the contour lines are not exactly perpendicular to their respective blue dimensional axes, suggesting a more complex (non-linear) pattern in the environmental variables.

. We find some associations between low income/education rates and COVID-19 impact indicators, which is an entirely unsurprising and telling trend. MDS doesn’t do much to separate the education and labor force metrics, but envfit helps to separate them. Their proximity makes a lot of sense since one would expect them to trend together and possibly have synergistic value. In CCA, we find substantially more variation on the second CCA axis, which is inversely related to unemployment.